Gender bias in audience of seminars and career position
Data
EcoEncontros Seminar talks
Talks from EcoEncontros Seminar series at the Graduate Program of Ecology in the University of SĂŁo Paulo (PPGE-USP), Brazil
See file metadata.txt, in folder data for more description and detail of the dataset.
data <- read.table("data/presentations_PPGE_2008-2019.csv", sep=",",
header=T, as.is=T)
data$date <- dmy(data$date)
data$year <- year(data$date)
#skimr::skim(data)Excluding special events as round tables and discussions not related to a project or study presented by someone.
IDs <- c(154, 250, 211, 289)
data <- data %>% filter(!id %in% IDs)For this specific analysis, excluding speakers that are not in academia (âothersâ), and keeping undergraduate students, MD and PhD in the group student. postdoc, professor or researcher*.
*Researchers are included in the professor categorical position (column position_cat) because all of them come from research institutions.
data <- data %>% filter(position_cat != "others")
data$position_cat <- fct_relevel(data$position_cat, "student",
"postdoc","professor")Excluding seminars with more than one speaker
events <- data %>% count(id) %>% filter(n>1)
data <- data %>% filter(!id %in% events$id,
!is.na(audience_n))dim(data)## [1] 299 30
Data description
Audience by gender and academic position
ggplot(data, aes(x=position_cat, y=audience_n, fill=gender)) +
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
geom_boxplot() #geom_violin(position = position_dodge(0.8)) +
#geom_jitter(position=position_jitterdodge(0.2),shape=21)library(ggbeeswarm)
# outra opção
ggplot(data, aes(x=position_cat, y=audience_n, fill=gender)) +
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
geom_violin(col="black") +
geom_quasirandom(dodge.width = 0.9, shape=21)+
stat_summary(fun.y=median, aes(ymin=..y.., ymax=..y..),geom='errorbar',
width=0.8, size=0.8, position = position_dodge(width = 0.9))+
xlab("") + ylab("Audience (N)")Variation in time
ggplot(data, aes(x=date, y=audience_n, fill=gender)) +
facet_wrap(~position_cat, ncol=1)+
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
geom_quasirandom(dodge.width = 0.9, shape=21)+
geom_smooth()+
xlab("") + ylab("Audience (N)")Looking for possible biases for speakers from inside and outside PPGE.
data$ppge <- ifelse(data$origin == "IB", "inside", "outside")
table(data$gender,data$ppge)##
## inside outside
## F 76 49
## M 79 95
ggplot(data, aes(x=ppge, y=audience_n, fill=gender)) +
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
geom_violin(col="black") +
geom_quasirandom(dodge.width = 0.9, shape=21)+
stat_summary(fun.y=median, aes(ymin=..y.., ymax=..y..),geom='errorbar',
width=0.8, size=0.8, position = position_dodge(width = 0.9))+
xlab("PPGE") + ylab("Audience (N)")Looking for possible biases for speakers from Brazil and abroad.
data$brazilian <- ifelse(data$country == "Brasil", "yes", "no")
table(data$gender,data$brazilian)##
## no yes
## F 22 103
## M 50 124
ggplot(data, aes(x=brazilian, y=audience_n, fill=gender)) +
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
geom_violin(col="black") +
geom_quasirandom(dodge.width = 0.9, shape=21)+
stat_summary(fun.y=median, aes(ymin=..y.., ymax=..y..),geom='errorbar',
width=0.8, size=0.8, position = position_dodge(width = 0.9))+
xlab("Brazilian") + ylab("Audience (N)")Modeling
Negative binomial
data$affirm_action <- ifelse(data$year<2018,"before", "after")mg0 <- glm.nb(audience_n~ 1, data=data)
mg1 <- glm.nb(audience_n~ gender, data=data)
mg2 <- glm.nb(audience_n~ position_cat, data=data)
mg3 <- glm.nb(audience_n~ year, data=data)
mg3b <- glm.nb(audience_n~ affirm_action, data=data)
mg4 <- glm.nb(audience_n~ gender + position_cat, data=data)
mg5 <- glm.nb(audience_n~ gender + year, data=data)
mg5b <- glm.nb(audience_n~ gender + affirm_action, data=data)
mg6 <- glm.nb(audience_n~ year + position_cat, data=data)
mg6b <- glm.nb(audience_n~ affirm_action + position_cat, data=data)
mg7 <- glm.nb(audience_n~ gender*position_cat, data=data)
mg8 <- glm.nb(audience_n~ gender*year, data=data)
mg8b <- glm.nb(audience_n~ gender*affirm_action, data=data)
mg9 <- glm.nb(audience_n~ year*position_cat, data=data)
mg9b <- glm.nb(audience_n~ affirm_action*position_cat, data=data)
mg10 <- glm.nb(audience_n~ gender + position_cat + year, data=data)
mg10b <- glm.nb(audience_n~ gender + position_cat + affirm_action, data=data)
mg11 <- glm.nb(audience_n~ gender*position_cat + year, data=data)
mg11b <- glm.nb(audience_n~ gender*position_cat + affirm_action, data=data)
mg12 <- glm.nb(audience_n~ gender + position_cat * year, data=data)
mg12b <- glm.nb(audience_n~ gender + position_cat * affirm_action, data=data)
mg13 <- glm.nb(audience_n~ gender*year + position_cat, data=data)
mg13b <- glm.nb(audience_n~ gender*affirm_action + position_cat, data=data)
mg14 <- glm.nb(audience_n~ gender*position_cat*year, data=data)
mg14b <- glm.nb(audience_n~ gender*position_cat*affirm_action, data=data)
AICtab(mg2, mg0,mg1, mg3, mg4,mg5,mg6,mg7,mg8,mg9,mg10,mg11,mg12,mg13,mg14,
mg3b,mg5b,mg6b,mg8b,mg9b,mg10b,mg11b,mg12b,mg13b, mg14b)## dAIC df
## mg11b 0.0 8
## mg10b 1.2 6
## mg13b 3.2 7
## mg12b 4.4 8
## mg11 5.7 8
## mg10 6.1 6
## mg6b 6.8 5
## mg14b 7.6 13
## mg7 7.7 7
## mg13 8.1 7
## mg4 8.2 5
## mg12 9.8 8
## mg9b 10.5 7
## mg6 10.9 5
## mg2 13.0 4
## mg14 14.0 13
## mg9 14.2 7
## mg5b 34.1 4
## mg8b 35.8 5
## mg5 37.0 4
## mg1 38.1 3
## mg8 39.0 5
## mg3b 51.1 3
## mg3 52.6 3
## mg0 53.4 2
Residual diagnostic
hnp::hnp(mg11b)## Negative binomial model (using MASS package)
plot(simulateResiduals(mg11b))hnp::hnp(mg10b)## Negative binomial model (using MASS package)
plot(simulateResiduals(mg10b))Models result
summary(mg11b)##
## Call:
## glm.nb(formula = audience_n ~ gender * position_cat + affirm_action,
## data = data, init.theta = 6.652451909, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8934 -0.7775 -0.1330 0.4896 3.8037
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.00465 0.06992 42.970 < 2e-16 ***
## genderM 0.11625 0.07453 1.560 0.11882
## position_catpostdoc 0.11708 0.10696 1.095 0.27372
## position_catprofessor 0.20059 0.10432 1.923 0.05449 .
## affirm_actionbefore -0.19567 0.06316 -3.098 0.00195 **
## genderM:position_catpostdoc -0.12030 0.14428 -0.834 0.40440
## genderM:position_catprofessor 0.22897 0.12861 1.780 0.07503 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(6.6525) family taken to be 1)
##
## Null deviance: 376.12 on 298 degrees of freedom
## Residual deviance: 303.02 on 292 degrees of freedom
## AIC: 2168.1
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 6.652
## Std. Err.: 0.708
##
## 2 x log-likelihood: -2152.067
performance::r2(mg11b)## # R2 for Generalized Linear Regression
## Nagelkerke's R2: 0.303
myg11b <- ggpredict(mg11b, terms=c("position_cat","gender", "affirm_action"))
plot(myg11b) +
scale_color_manual(values = c("#b2abd2", "#fdb863"))summary(mg10b)##
## Call:
## glm.nb(formula = audience_n ~ gender + position_cat + affirm_action,
## data = data, init.theta = 6.502702402, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8101 -0.7831 -0.1381 0.4704 3.9188
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.98211 0.06352 46.947 < 2e-16 ***
## genderM 0.15356 0.05463 2.811 0.00494 **
## position_catpostdoc 0.04418 0.07250 0.609 0.54229
## position_catprofessor 0.36571 0.06046 6.049 1.46e-09 ***
## affirm_actionbefore -0.18918 0.06316 -2.995 0.00274 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(6.5027) family taken to be 1)
##
## Null deviance: 369.75 on 298 degrees of freedom
## Residual deviance: 303.07 on 294 degrees of freedom
## AIC: 2169.3
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 6.503
## Std. Err.: 0.688
##
## 2 x log-likelihood: -2157.260
performance::r2(mg10b)## # R2 for Generalized Linear Regression
## Nagelkerke's R2: 0.282
myg10b <- ggpredict(mg10b, terms=c("position_cat","gender", "affirm_action"))
plot(myg10b) +
scale_color_manual(values = c("#b2abd2", "#fdb863"))Only professors - productivity metrics
Investigating if differences in productivity between male and female professors and researches are related to the audience.
Measured productivity publication metrics from Google Scholar for professors and researchers.
Creating productivity index using PCA 1st axis from metrics.
PCA productivity metrics
dp <- data %>% filter(!is.na(data$total_citation_n),
!is.na(data$nature_index_count))
table(dp$gender)##
## F M
## 20 67
Productivity publication metrics
pca1 <- PCA(dp[, c(21:28)], graph=F)
p1 <- fviz_pca_biplot(pca1, col.ind = dp$gender, addEllipses=TRUE,
col.ind.sub="none", geom="point",
repel = TRUE) +
scale_color_manual(name="gender",values = c("#b2abd2", "#fdb863"))+
scale_fill_manual(name="gender",values = c("#b2abd2", "#fdb863"))
p1Extracting PCA 2 first axes
dp$pc1 <- pca1$ind$coord[,1]
dp$pc2 <- pca1$ind$coord[,2]Modeling
m0 <- glm.nb(audience_n ~ 1, data=dp)
m1 <- glm.nb(audience_n ~ gender, data=dp)
m2 <- glm.nb(audience_n ~ pc1 + pc2, data=dp)
m3 <- glm.nb(audience_n ~ gender + pc1 + pc2, data=dp)
m4 <- glm.nb(audience_n ~ gender*pc1 + gender*pc2, data=dp)
AICtab(m0,m1,m2,m3,m4, base=T, weights=T)## AIC dAIC df weight
## m3 693.4 0.0 5 0.550
## m2 695.5 2.2 4 0.186
## m4 695.7 2.4 7 0.168
## m1 697.2 3.8 3 0.082
## m0 700.7 7.4 2 0.014
Residual diagnostic
Best model
hnp(m3)## Negative binomial model (using MASS package)
plot(simulateResiduals(m3)) ## Model results
summary(m3)##
## Call:
## glm.nb(formula = audience_n ~ gender + pc1 + pc2, data = dp,
## init.theta = 5.219733971, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5233 -0.7225 -0.2033 0.4610 3.3874
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.11594 0.11007 28.310 <2e-16 ***
## genderM 0.26225 0.12520 2.095 0.0362 *
## pc1 0.07478 0.02511 2.978 0.0029 **
## pc2 -0.01698 0.03941 -0.431 0.6665
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(5.2197) family taken to be 1)
##
## Null deviance: 103.589 on 86 degrees of freedom
## Residual deviance: 89.206 on 83 degrees of freedom
## AIC: 693.36
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 5.220
## Std. Err.: 0.927
##
## 2 x log-likelihood: -683.359
performance::r2(mg3)## # R2 for Generalized Linear Regression
## Nagelkerke's R2: 0.015
my3 <- ggpredict(m3, terms=c("pc1","gender"))
plot(my3) +
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
theme_cowplot() + ggtitle("") +
ylab("Audience (N)") + xlab("Productivity (PC1 axis)")plot(my3) +
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
theme_cowplot() + ggtitle("") +
ylab("Audience (N)") + xlab("Productivity (PC1 axis)")my3 <- ggpredict(m3, terms=c("pc2","gender"))
plot(my3) +
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
theme_cowplot()